# =============================================================================
# =============================================================================
# Safegraph Analysis
# This code is for replicating: Figure 3 and Appendix Figure A1.
# 
#       We want to study the first stage: how many visits and visitors are coming to plasma centers 1) that newly open, 2) have been open for a while.
#       We also want to study if there is a dip in visits after the stimulus checks during covid
# =============================================================================
# =============================================================================

afolder = ''
geo_year = 2019
geo = ts['geog']
ssuf = 'PlasmaEst'

# =============================================================================
# Map Plasma centers from Safegraph to my Plasma Center list.
#       While core has a variable that identifies plasma centers, it doesn't contain the specific plasma center id (mfeiu).
#       We merge the two datasets to know the precise plasma center age (from pcp)
# =============================================================================
ptm = pd.read_parquet(cdd['p_d_sg_sample'] + afolder + r'\pattern_cleaned_sample.parquet')
core = pd.read_parquet(cdd['p_d_sg_sample']+r'\core_cleaned_sample.parquet')
pcp = pd.read_parquet(cdd['p_d_pcp'] + r'/pcp.parquet')
core['plasma'] = 1 #The authors added a variable to core that identified plasma centers
pcp_sg = core[core.plasma==1] 
pcp_sg['open'] = pcp_sg['placekey'].map(ptm.groupby('placekey')['date_e'].min())
norm = pd.read_parquet(cdd['p_d_sg_sample'] + r'\norm_cleaned_sample.parquet')


#Look for the closest plasma center in safegraph
pcp_ber=pcp.copy()
t=pcp_sg[['placekey','lat','long','open']].to_dict(orient='records')
pcp_ber['dist_l'] = pcp_ber.apply(lambda x: [{**v,**{'distance':vinc.vincenty_inverse((v['lat'],v['long']),(x['clat'],x['clong'])).m}} for v in t] ,axis=1)
pcp_ber_e = pcp_ber.explode('dist_l')
pcp_ber_e['placekey'] = pcp_ber_e['dist_l'].apply(lambda x: x['placekey'])
pcp_ber_e['open_sg'] = pcp_ber_e['dist_l'].apply(lambda x: x['open'])
pcp_ber_e['dist'] = pcp_ber_e['dist_l'].apply(lambda x: x['distance'])
pcp_ber_e['dist_r'] = 1
pcp_ber_e = pcp_ber_e.sort_values(by=['mfeiu','dist'])
pcp_ber_e['dist_r'] = pcp_ber_e.groupby('mfeiu')['dist_r'].cumsum()


#For each mfeiu use the best match and show summary statistics on the distance.
pcp_ber_ess = pcp_ber_e[(pcp_ber_e['close'].dt.year > 2016)].copy()
pcp_ber_ess = pd.merge(pcp_ber_ess[['mfei','mfeiu','open','close','open_sg','name_app','name_elgl','name_ref','address','city','zip','dist','dist_r','sgid','placekey']], pcp_sg[['sgid','name','address','city','zip']],how='left',on='sgid',suffixes=('_ber','_sg'))
pcp_ber_ess = pd.merge(pcp_ber_ess,pcp[['mfeiu','gap','gdate_moved','gdate_p','gdate_np']],how='left',on='mfeiu')
#Choose the closest establishment that is tracked by Safegraph which is within 1km.
pcp_ber_ess['dist_r'] = pcp_ber_ess['dist_r'] + pd.isna(pcp_ber_ess['open_sg'])*10000
pcp_ber_ess['selected'] = 1*(pcp_ber_ess.dist_r == pcp_ber_ess.groupby('mfeiu')['dist_r'].transform('min'))*(pcp_ber_ess['dist']<250)

pcpm=pcp_ber_ess[pcp_ber_ess.selected==1].copy()
pcpm['date_del'] = (pcpm['open_sg'] - pcpm['open']).dt.days / 365
pcpm['match_n'] = pcpm.groupby('placekey')['mfeiu'].transform('count')
pcpm = pcpm[(pcpm.dist==pcpm.groupby('placekey')['dist'].transform('min'))]
pcpm = pcpm[(pcpm.open==pcpm.groupby('placekey')['open'].transform('max'))]
pcpm = pcpm[pcpm.mfei>10000]

# =============================================================================
# Prep the Safegraph pattern dataset of plasma centers for analysis.
# =============================================================================

ptm['open'] = ptm['placekey'].map(pcpm.set_index('placekey')['open'])
ptm['mfeiu'] = ptm['placekey'].map(pcpm.set_index('placekey')['mfeiu'])
ptm = ptm.sort_values(by=['placekey','date_e'])
def rmonths(dt_act,dt_base):
    return 12*(dt_act.dt.year - dt_base.dt.year) + (dt_act.dt.month - dt_base.dt.month)
ptm['etime'] = rmonths(ptm['date_e'],ptm['open'])
ptm['etimeq'] = np.floor(ptm['etime'] / 3)
ptm['stateym'] = np.floor(ptm['cbg'] / 10000000000) * 1000000 + ptm['date_e'].dt.year * 100 + ptm['date_e'].dt.month
ptm['nfactor'] = ptm['stateym'].map(norm.groupby('stateym')['nfactor'].mean())
ptm['visitsn'] = ptm['visits'] * ptm['nfactor']
ptm['visitorsn'] = ptm['visitors'] * ptm['nfactor']
ptm['dt'] = ptm['date_e'].dt.date
ptm['dts'] = ptm['date_e'].dt.to_period('M').dt.to_timestamp().dt.date
ptm['dtm'] = (ptm['date_e'].dt.to_period('M').dt.to_timestamp() + dt.timedelta(days=14)).dt.date


#Liquidity Shocks Dates
# • Law enacted mid March 2020 (1st checks Apr 15, 2020): Up to $1,200 per adult, $500 per child (reduced by AGI> $75k) ($271 B)
# • Law enacted late December 2020 (1st checks Jan 4, 2021): Up to $600 per adult, $600 per child (reduced by AGI> $75k) ($141 B)
# • Law enacted early March 2021 (1st checks Mar 17 2021): Up to $1,400 per adult, $1,400 per child + a later payment after receiving 2020 tax return (which is hard to time) (reduced by AGI> $75k) ($401 B)
# • 1st round of the child tax credit when out Jun 15 2021
# • First checks in accounts is determined by observing Chase checking accounts ([Greig2021]) 
lsd = {'Check 1 ($271B)':dt.datetime(2020,4,15), 'Check 2 ($141B)':dt.datetime(2021,1,4),'Check 3':dt.datetime(2021,3,17),'Check 4':dt.datetime(2021,6,15)}
ptm['lsd'] = 1*(ptm['date_e'] == dt.datetime(2020,4,30)) + 2*(ptm['date_e'] == dt.datetime(2021,1,31)) + 3*(ptm['date_e'] == dt.datetime(2021,3,31)) + 4*(ptm['date_e'] == dt.datetime(2021,6,30)) 
ptm['lsd_s'] = ptm['lsd'] + 1*(ptm['date_e'] == dt.datetime(2020,3,31)) + 2*(ptm['date_e'] == dt.datetime(2020,12,31)) + 3*(ptm['date_e'] == dt.datetime(2021,2,28)) + 4*(ptm['date_e'] == dt.datetime(2021,5,31))
ptm['lsd_b'] = 1*(ptm['lsd'] > 0)

# Mark establishments that open from between Q3 2018 and Q2 2020.
ptm['open_18q3_20q2'] = 1*(ptm.placekey.isin(pcpm[(pcpm.open.dt.date>=dt.datetime(2018,7,1).date())&(pcpm.open.dt.date<dt.datetime(2020,9,1).date())]['placekey'].tolist()))
ptm['open_18q3_19q4'] = 1*(ptm.placekey.isin(pcpm[(pcpm.open.dt.date>=dt.datetime(2018,7,1).date())&(pcpm.open.dt.date<dt.datetime(2020,1,1).date())]['placekey'].tolist()))
ptm['open_lte18q4'] = 1*(ptm.placekey.isin(pcpm[(pcpm.open.dt.date<=dt.datetime(2019,1,1).date())]['placekey'].tolist()))

# Add the 75th pct of establishment visits and visitors by placekey as columns (for subsetting)
ptm['placekey_visitsn_p25'] = ptm['placekey'].map(ptm.groupby('placekey')['visitsn'].quantile(q=0.25))
ptm['placekey_visitsn_p50'] = ptm['placekey'].map(ptm.groupby('placekey')['visitsn'].quantile(q=0.50))
ptm['placekey_visitsn_p75'] = ptm['placekey'].map(ptm.groupby('placekey')['visitsn'].quantile(q=0.75))
ptm['placekey_visitorsn_p25'] = ptm['placekey'].map(ptm.groupby('placekey')['visitorsn'].quantile(q=0.25))
ptm['placekey_visitorsn_p50'] = ptm['placekey'].map(ptm.groupby('placekey')['visitorsn'].quantile(q=0.50))
ptm['placekey_visitorsn_p75'] = ptm['placekey'].map(ptm.groupby('placekey')['visitorsn'].quantile(q=0.75))

#Define a variable with the sample used for analysis of safegraph in the paper.
ptm['samp_0118_0521'] = 1*(ptm['date_e'].dt.date < dt.datetime(2021,6,1).date())

ptm['m'] = ptm['date_e'].dt.month
ptm['y'] = ptm['date_e'].dt.year

#Export the dataset for plotting and analysis in R.
ptm = ptm.drop_duplicates(subset=['placekey','dt'])
ptm.to_parquet(cdd['p_d_sg_pva'] + afolder + r'\ptm_aug_'+ssuf+'.parquet')































